1 Introduction

1.1 Import packages

library(pROC)
library(ggpubr)
library(Biostrings)
## Warning: package 'BiocGenerics' was built under R version 4.0.5
library(data.table)
library(dplyr)
library(purrr)
library(tidyverse)
library(yardstick)
library(doParallel)
library(foreach)
library(stringdist)
library(caret)

1.2 useful Functions

# Function to calculate the F score, given beta, precision and recall.
 calculate_f_beta = function(beta, precision, recall) {
   return((beta^2 + 1)*(precision*recall / ((beta^2)*precision + recall)))
 }
# Function to provide a closest match. Used to match HLA Alleles across mixed output styles.
ClosestMatch2 = function(string, stringVector){

  stringVector[amatch(string, stringVector, maxDist=Inf)]
}

1.3 Import data for testing

  • These data have been filtered as written in the main manuscript.
FullDataset=readRDS("CoV2_testing_dataset_filtered.rds")

2 Test the models OTB

2.1 IEDB Model

2.1.1 Run

  • Below is the code to test the models against these data.
  • Model is included in “IEDB_Immunogenicity_Model_Calis” folder.
  • Peptides are tested in a ‘per allele’ manner.
TEST_DATA_LOCATION="SARS_COV_2_DATASET/IEDB_OTB/"
# For each allele in the data
foreach(allele_i=1:length(unique(FullDataset$HLA_Allele)))%do%{
# Clean the allele text
    HLA_ALLELE_FOR_TESTING = gsub(x=unique(FullDataset$HLA_Allele)[allele_i],pattern=":",replacement = "")
# Write respective data to file
    testdata=paste0(TEST_DATA_LOCATION,"Allele_",HLA_ALLELE_FOR_TESTING,"_test_data.txt")
    write.table(FullDataset %>% filter(HLA_Allele %in% unique(FullDataset$HLA_Allele)[allele_i]) %>% select(Peptide) %>% pull,file=testdata,sep="\n",col.names = F,row.names = F,quote=F)
# Define the output file for the predictions
    RESULTS_OUTPUT = paste0(TEST_DATA_LOCATION,"Allele_",HLA_ALLELE_FOR_TESTING,"_Results.txt")
# Run the model for allele x.
  system(paste0("python IEDB_Immunogenicity_Model_Calis/immunogenicity_model/predict_immunogenicity.py ",testdata,
              " --allele=",HLA_ALLELE_FOR_TESTING," > ",RESULTS_OUTPUT))
}


2.1.2 Read

  • Below reads in the output from executing the model.
TEST_DATA_LOCATION="SARS_COV_2_DATASET/IEDB_OTB/"
files <- dir(TEST_DATA_LOCATION, pattern = "*_Results")
# Read in all the Results files.
data3 <- data_frame(file = files) %>%
  mutate(file_contents = map(file,
           ~ fread(file.path(TEST_DATA_LOCATION, .)))
  )
## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
IEDB_RESULTS <- unnest(data3)
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(file_contents)`
# Files are output from IEDB per allele and saved with the allele in the file name. Allele is not output in the data, so the below extracts allele information into a column from the file name
IEDB_RESULTS$file=gsub(x=IEDB_RESULTS$file,pattern="_Results.txt",replacement="")
IEDB_RESULTS=IEDB_RESULTS %>% mutate(HLA_Allele = gsub(x=IEDB_RESULTS$file,pattern="Allele_",replacement = ""))
# Map the allele in model output to allele nomenclature in our test dataset
IEDB_RESULTS$HLA_Allele = ClosestMatch2(IEDB_RESULTS$HLA_Allele,unique(FullDataset$HLA_Allele))
# Clean the data table and join it with the original full dataset
IEDB_RESULTS=IEDB_RESULTS %>% dplyr::rename(Peptide=peptide, ImmunogenicityScore=score) %>% select(!file)
IEDB_RESULTS=IEDB_RESULTS %>% inner_join(FullDataset,by=c("Peptide","HLA_Allele")) %>% select(!length) %>% mutate(Dataset = "IEDB")

print(c("IEDB Results has same number of rows as test dataset? : ",IEDB_RESULTS %>% nrow == FullDataset %>% nrow))
## [1] "IEDB Results has same number of rows as test dataset? : "
## [2] "TRUE"

2.2 NetTepi

2.2.1 Run

  • The below requires NetTepi to be installed in folder /Applications/netTepi-1.0_orig folder (macOS)
  • Seperates peptides by HLA. Outputs analysis into /NETTEPI_OTB folder.

TEST_DATA_LOCATION="SARS_COV_2_DATASET/NETTEPI_OTB/"
# For each allele
for(allele_i in 1:length(unique(FullDataset$HLA_Allele))){
# Clean the allele text: can't create a file with * or : in the name.
  HLA_ALLELE_FOR_TESTING = gsub(x=unique(FullDataset$HLA_Allele)[allele_i],pattern=":|\\*",replacement = "")

    testdata=paste0(TEST_DATA_LOCATION,"Allele_",HLA_ALLELE_FOR_TESTING,"_test_data.txt")
# Filter the data for the run
  data_for_run = FullDataset %>% filter(HLA_Allele %in% unique(FullDataset$HLA_Allele)[allele_i])
# What lengths are used on this run?
  lengths = data_for_run %>% mutate(Length=Biostrings::width(Peptide))%>% pull(Length)%>% unique
# Write out data for this run
write.table(data_for_run %>% select(Peptide) %>% pull,file=testdata,sep="\n",col.names = F,row.names = F,quote=F)
# Run model
RESULTS_OUTPUT = paste0(TEST_DATA_LOCATION,"Allele_",HLA_ALLELE_FOR_TESTING,"_Results.xls")
HLA_ALLELE_FOR_TESTING = gsub(x=unique(FullDataset$HLA_Allele)[allele_i],pattern="\\*",replacement = "")

  system(paste0("/Applications/netTepi-1.0_orig/netTepi -a ",HLA_ALLELE_FOR_TESTING," -p ",testdata," -xlsfile ",RESULTS_OUTPUT," -l ",paste0(lengths,collapse = ",")))
# Change .xls extension to .csv
  system(paste0("mv ",RESULTS_OUTPUT," " ,gsub(x=RESULTS_OUTPUT,pattern=".xls",replacement=""),".csv"))

}

2.2.2 read in output

  • Below reads in the output from executing the model.
# Read output files in and combine them
TEST_DATA_LOCATION = "SARS_COV_2_DATASET/NETTEPI_OTB"
files <- dir(TEST_DATA_LOCATION, pattern = "_Results.csv")

data3 <- data_frame(file = files) %>%
  mutate(file_contents = map(file,
           ~ fread(file.path(TEST_DATA_LOCATION, .)))
  )
# Get rid of columns we dont need.
NetTepi_Results <- unnest(data3) %>% as.data.table  %>% dplyr::select(!c(Pos,Identity,Aff,Stab,Tcell,`%Rank`,file))
# Combined score becomes the 'immunogenicity score'.
NetTepi_Results=NetTepi_Results %>% dplyr::rename(ImmunogenicityScore=Comb,HLA_Allele=Allele) #Use the 'combined' score for analysis
#Allele formatting as input and output is different, so we match the alleles between NetTepi output and our test dataset by similarity.
NetTepi_Results$HLA_Allele = ClosestMatch2(NetTepi_Results$HLA_Allele,unique(FullDataset$HLA_Allele))

# Below confirms that the above matching works as otherwise the correct number of rows would not be joined by peptide-HLA
NetTepi_Results=NetTepi_Results %>% inner_join(FullDataset,by=c("Peptide","HLA_Allele"))
print(c("NetTepi Results has same number of rows as test dataset? : ",NetTepi_Results %>% nrow == FullDataset %>% nrow))
## [1] "NetTepi Results has same number of rows as test dataset? : "
## [2] "TRUE"
NetTepi_Results=NetTepi_Results%>% select(!Epitopes) %>% mutate(Dataset = 'NETTEPI')

2.3 iPred

2.3.1 Run OTB

  • The below .R script is taken from Shughay’s repo (https://github.com/antigenomics/ipred)
  • Modifications are made only to the last three lines of the .R script, which is done to employ the trained model to predict P(immunogenicity) for our CoV2 peptides of interest.

# Write data to file and run below script to train model OTB and process it
FullDataset %>% select(Peptide) %>% write.table(file="COV2_peptides_for_OTB_analysis.txt",quote=F,row.names = F)
source("run_ipred_OTB_CoV2.R")

2.3.2 read

  • Below reads in the output from executing the model.
IPRED_RESULTS=fread("SARS_COV_2_DATASET/IPRED_OTB/IPRED_RESULTS.txt")%>%dplyr::rename(Peptide=antigen.epitope,ImmunogenicityScore=imm.prob)
IPRED_RESULTS=IPRED_RESULTS %>% inner_join(FullDataset)
## Joining, by = "Peptide"
print(c("IPRED Results has same number of rows as test dataset? : ",IPRED_RESULTS %>% nrow == FullDataset %>% nrow))
## [1] "IPRED Results has same number of rows as test dataset? : "
## [2] "TRUE"
IPRED_RESULTS=IPRED_RESULTS %>% mutate(Dataset = "IPRED")

##Repitope - Output data in a format readable by Repitope

FullDataset %>% dplyr::rename(MHC=HLA_Allele) %>% mutate(Dataset = "CoV2_Eps") %>% write.csv(file = "CoV2_TestData_ForRepitope.csv",quote=F,row.names = F)

2.3.3 Compute features, train MHC-I model and make predictions

  • The below can take several hours to complete. I would suggest running each script individually to reduce the chance of issues with memory. I tend to use RScript via the command line.
# Compute the features for our test dataset and write them to FST file
#source("compute_repitope_features.R")

#source("run_repitope_OTB.R")


2.3.4 REpitope: Read predictions

  • Read in the predictions.
REPITOPE_RESULTS = fread("SARS_COV_2_DATASET/REPITOPE_OTB/ALL_PREDICTIONS.csv")
print(c("REPITOPE Results has same number of rows as test dataset? : ",REPITOPE_RESULTS %>% nrow == FullDataset %>% nrow))
## [1] "REPITOPE Results has same number of rows as test dataset? : "
## [2] "FALSE"
REPITOPE_RESULTS=REPITOPE_RESULTS %>% full_join(FullDataset) %>% select(!"ImmunogenicityScore.cv")
REPITOPE_RESULTS= REPITOPE_RESULTS %>% mutate(Dataset = "REPITOPE")

print(c("REPITOPE Results has same number of rows as test dataset? : ",REPITOPE_RESULTS %>% nrow == FullDataset %>% nrow))
## [1] "REPITOPE Results has same number of rows as test dataset? : "
## [2] "TRUE"

2.4 NetMHCpan EL

2.4.1 Run netmhcpan 4.0

  • Processes data per allele.
  • EL output, score on scale 0-1.
# For each allele
TEST_DATA_LOCATION = "SARS_COV_2_DATASET/NETMHCPAN_4_L/"
for(allele_i in 1:length(unique(FullDataset$HLA_Allele))){
# Clean HLA and find lengths of test peptides
    HLA_ALLELE_FOR_TESTING = gsub(x=unique(FullDataset$HLA_Allele)[allele_i],pattern="\\*",replacement = "")
    LENGTHS=FullDataset %>% mutate(Length = nchar(Peptide)) %>% filter(HLA_Allele %in% unique(FullDataset$HLA_Allele)[allele_i])%>% pull(Length) %>% unique

  testdata=paste0(TEST_DATA_LOCATION,"Allele_",HLA_ALLELE_FOR_TESTING,"_NetMHC_data.txt")
# Write test peptides to file for reading into netMHCpan
  write.table(FullDataset %>% filter(HLA_Allele %in% unique(FullDataset$HLA_Allele)[allele_i]) %>% select(Peptide) %>% pull,file=testdata,sep="\n",col.names = F,row.names = F,quote=F)
# Run model
    RESULTS_OUTPUT = paste0(TEST_DATA_LOCATION,"Allele_",HLA_ALLELE_FOR_TESTING,"_NetMHC","_Results.csv")
    system(paste0("/Applications/netMHCpan-4.0/netMHCpan -p ",testdata," -a ",HLA_ALLELE_FOR_TESTING," -l ",paste0(LENGTHS,collapse = ",")," -xls -xlsfile ", RESULTS_OUTPUT))

}

2.4.2 Read in and process

# Read in and process
TEST_DATA_LOCATION = "SARS_COV_2_DATASET/NETMHCPAN_4_L/"
data_path <- TEST_DATA_LOCATION
files <- dir(data_path, pattern = "NetMHC_Results.csv")

data3 <- data_frame(file = files) %>%
  mutate(file_contents = map(file,
                             ~ fread(file.path(data_path, .),skip = 1))
  )

Netmhcpanres <- unnest(data3)
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(file_contents)`
# Extract allele from file name
Netmhcpanres=Netmhcpanres %>% mutate(HLA_Allele = gsub(x=Netmhcpanres$file,pattern="Allele_|_NetMHC_Results.csv",replacement = ""))
# Map HLA allele nomenclature
Netmhcpanres$HLA_Allele = ClosestMatch2(Netmhcpanres$HLA_Allele,unique(FullDataset$HLA_Allele))
# Join with test dataset
Netmhcpanres=Netmhcpanres %>% select(! c(file,Pos,ID,core,icore))%>% inner_join( FullDataset)
## Joining, by = c("Peptide", "HLA_Allele")
Netmhcpanres %>% nrow
## [1] 878
# Munge the data table
Netmhcpanres=Netmhcpanres %>% select(Peptide, HLA_Allele,Immunogenicity,Ave,ImmunogenicityCont) %>% dplyr::rename(ImmunogenicityScore = Ave)%>% mutate(Dataset = "netMHCpan_EL")

2.5 NetMHCpan BA

2.5.1 Run netmhcpan

  • BA output rather than EL
TEST_DATA_LOCATION = "SARS_COV_2_DATASET/NETMHCPAN_4_BA/"
for(allele_i in 1:length(unique(FullDataset$HLA_Allele))){
    HLA_ALLELE_FOR_TESTING = gsub(x=unique(FullDataset$HLA_Allele)[allele_i],pattern="\\*",replacement = "")


  LENGTHS=FullDataset %>% mutate(Length = nchar(Peptide)) %>% filter(HLA_Allele %in% unique(FullDataset$HLA_Allele)[allele_i])%>% pull(Length) %>% unique

  testdata=paste0(TEST_DATA_LOCATION,"Allele_",HLA_ALLELE_FOR_TESTING,"_NetMHC_data.txt")
  write.table(FullDataset %>% filter(HLA_Allele %in% unique(FullDataset$HLA_Allele)[allele_i]) %>% select(Peptide) %>% pull,file=testdata,sep="\n",col.names = F,row.names = F,quote=F)
  # Run model
  RESULTS_OUTPUT = paste0(TEST_DATA_LOCATION,"Allele_",HLA_ALLELE_FOR_TESTING,"_NetMHC","_Results.csv")

    system(paste0("/Applications/netMHCpan-4.0/netMHCpan -BA -p ",testdata," -a ",HLA_ALLELE_FOR_TESTING," -l ",paste0(LENGTHS,collapse = ",")," -xls -xlsfile ", RESULTS_OUTPUT))

}
TEST_DATA_LOCATION = "SARS_COV_2_DATASET/NETMHCPAN_4_BA/"

data_path <- TEST_DATA_LOCATION
files <- dir(data_path, pattern = "NetMHC_Results.csv")

data3 <- data_frame(file = files) %>%
  mutate(file_contents = map(file,
                             ~ fread(file.path(data_path, .),skip = 1))
  )

NetmhcpanresBA <- unnest(data3)
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(file_contents)`
NetmhcpanresBA=NetmhcpanresBA %>% mutate(HLA_Allele = gsub(x=NetmhcpanresBA$file,pattern="Allele_|_NetMHC_Results.csv",replacement = ""))

NetmhcpanresBA$HLA_Allele = ClosestMatch2(NetmhcpanresBA$HLA_Allele,unique(FullDataset$HLA_Allele))
NetmhcpanresBA=NetmhcpanresBA %>% inner_join( FullDataset)
## Joining, by = c("Peptide", "HLA_Allele")
NetmhcpanresBA=NetmhcpanresBA %>% select(Peptide, HLA_Allele,Immunogenicity,Ave, ImmunogenicityCont) %>% dplyr::rename(ImmunogenicityScore = Ave)%>% mutate(Dataset = "netMHCpan_BA")

2.6 PRIME

2.6.1 Run


TEST_DATA_LOCATION="SARS_COV_2_DATASET/PRIMEOTB/"
# Model apparently can't deal with spaces in full path so save data to ~Documents/PRIMEDATA/x

for(allele_i in 1:length(unique(FullDataset$HLA_Allele))){
  HLA_ALLELE_FOR_TESTING = gsub(x=unique(FullDataset$HLA_Allele)[allele_i],pattern=":|\\*||-|HLA",replacement = "")

  testdata=paste0("~/Documents/PRIMEDATA/",HLA_ALLELE_FOR_TESTING,".txt")

  data_for_run = FullDataset %>% filter(HLA_Allele %in% unique(FullDataset$HLA_Allele)[allele_i])
  write.table(data_for_run %>% select(Peptide) %>% pull,file=testdata,sep="\n",col.names = F,row.names = F,quote=F)
  # Run model
  RESULTS_OUTPUT = paste0("~/Documents/PRIMEDATA/",HLA_ALLELE_FOR_TESTING,"Results.txt")

  system(paste0("/Applications/PRIME-master/PRIME -i ",testdata," -a ",HLA_ALLELE_FOR_TESTING," -mix /Applications/MixMHCpred-2.1/MixMHCpred"," -o ", RESULTS_OUTPUT))
}

2.6.2 read

TEST_DATA_LOCATION = "SARS_COV_2_DATASET/PRIME_OTB"
files <- dir(TEST_DATA_LOCATION, pattern = "Results.txt")

data3 <- data_frame(file = files) %>%
  mutate(file_contents = map(file,
           ~ fread(file.path(TEST_DATA_LOCATION, .)))
  )

PRIME_RESULTS <- unnest(data3) %>% as.data.table %>% select(Peptide,BestAllele,Score_bestAllele)

PRIME_RESULTS=PRIME_RESULTS %>% dplyr::rename(ImmunogenicityScore=Score_bestAllele,HLA_Allele=BestAllele)
PRIME_RESULTS$HLA_Allele = ClosestMatch2(PRIME_RESULTS$HLA_Allele, unique(FullDataset$HLA_Allele))

PRIME_RESULTS=PRIME_RESULTS %>% inner_join(FullDataset,by=c("Peptide","HLA_Allele"))
print(c("PRIME_RESULTS Results has same number of rows as test dataset? : ",PRIME_RESULTS %>% nrow == FullDataset %>% nrow))
## [1] "PRIME_RESULTS Results has same number of rows as test dataset? : "
## [2] "TRUE"
PRIME_RESULTS=PRIME_RESULTS %>% mutate(Dataset = "PRIME")

2.7 DeepImmuno

  • Can only process peptides of lengths only 9 and 10. We have pre-filtered for these lengths to compile the test dataset so does not affect the number of CoV2 peptides here.
  • We were unable to compile model locally, so instead we used the webserver and read the results in.
# Output the data for use on the webserver.
FullDataset %>% mutate(Length =  width(Peptide)) %>% filter(Length %in% c(9,10)) %>% select(Peptide, HLA_Allele) %>% mutate(HLA_Allele = gsub("\\:","",HLA_Allele)) %>% readr::write_csv(file="OUT_DEEPIMMUNO.csv",col_names = FALSE)
# Read in the webserver results
DEEPIMM=fread("SARS_COV_2_DATASET/DEEPIMMUNO_OTB/result.txt") %>% dplyr::rename(Peptide =peptide, HLA_Allele=HLA,ImmunogenicityScore=immunogenicity)
# Match the HLAs
DEEPIMM$HLA_Allele = ClosestMatch2(DEEPIMM$HLA_Allele,unique(FullDataset$HLA_Allele))
DEEPIMM %>% nrow
## [1] 878
DEEPIMM=DEEPIMM %>% inner_join(FullDataset)%>%mutate(Dataset = "DeepImmuno")
## Joining, by = c("Peptide", "HLA_Allele")

3 GAO predictor

3.1 read

TEST_DATA_LOCATION= "SARS_COV_2_DATASET/GAO_OTB"
files <- dir(TEST_DATA_LOCATION, pattern = ".csv")

data3 <- data_frame(file = files) %>%
  mutate(file_contents = map(file,
           ~ fread(file.path(TEST_DATA_LOCATION, .)))
  )

GAO_RESULTS <- unnest(data3) %>% as.data.table
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(file_contents)`
FullDataset %>% nrow
## [1] 878
FullDataset %>% left_join(GAO_RESULTS %>% dplyr::rename(Peptide=peptide, HLA_Allele=HLA)) %>% nrow
## Joining, by = c("Peptide", "HLA_Allele")
## [1] 878
GAO_RESULTS=FullDataset %>% left_join(GAO_RESULTS %>% dplyr::rename(Peptide=peptide, HLA_Allele=HLA))
## Joining, by = c("Peptide", "HLA_Allele")
GAO_RESULTS=GAO_RESULTS %>% select(!file) %>% dplyr::rename(ImmunogenicityScore=amplitude) %>% select(!immunogenic)%>% mutate(Dataset = "GAO")

4 Results

4.1 Combine all data into DT ‘combinedData’

  • Confirm each model (Dataset column) has 878 obs each
# Bind all the model results together
combinedData = rbind(IEDB_RESULTS,NetTepi_Results,IPRED_RESULTS,REPITOPE_RESULTS,PRIME_RESULTS, Netmhcpanres,NetmhcpanresBA,GAO_RESULTS,DEEPIMM)

ALLOWEDLENGTHS = c(9,10) # Does not filter anything in this setting.
combinedData = combinedData%>% mutate(Length = width(Peptide)) %>% filter(Length %in% ALLOWEDLENGTHS)%>% select(!Length)

combinedData %>% nrow
## [1] 7902
# Confirm all have 878 obs
combinedData %>% select(Dataset) %>% table
## .
##   DeepImmuno          GAO         IEDB        IPRED      NETTEPI        PRIME 
##          878          878          878          878          878          878 
##     REPITOPE netMHCpan_BA netMHCpan_EL 
##          878          878          878

4.2 Create ROC-Curves

AUCDF = combinedData %>% group_by(Dataset) %>% dplyr::summarise(ROC=as.numeric(roc(Immunogenicity ~ ImmunogenicityScore)$auc))
# use 'roc' function from pROC to generate roc curves for each model
NETTEPIAUC=roc(Immunogenicity ~ ImmunogenicityScore,data=combinedData %>% filter(Dataset %in% 'NETTEPI'))
IPREDAUC=roc(Immunogenicity ~ ImmunogenicityScore,data=combinedData %>% filter(Dataset %in% 'IPRED'))
IEDBMODELAUC=roc(Immunogenicity ~ ImmunogenicityScore,data=combinedData %>% filter(Dataset %in% 'IEDB'))
REPITOPE_AUC_CV=roc(Immunogenicity ~ ImmunogenicityScore,data=combinedData %>% filter(Dataset %in% 'REPITOPE'))
PRIME_AUC_CV =  roc(Immunogenicity ~ ImmunogenicityScore,data=combinedData %>% filter(Dataset %in% 'PRIME'))
DEEP_IMM_AUC =  roc(Immunogenicity ~ ImmunogenicityScore,data=combinedData %>% filter(Dataset %in% 'DeepImmuno'))
NETMHCPAN_IMM_AUC = roc(Immunogenicity ~ ImmunogenicityScore,data=combinedData %>% filter(Dataset %in% 'netMHCpan_EL'))
NETMHCPAN_IMM_BA_AUC = roc(Immunogenicity ~ ImmunogenicityScore,data=combinedData %>% filter(Dataset %in% 'netMHCpan_BA'))
GAO_AUC = roc(Immunogenicity ~ ImmunogenicityScore,data=combinedData %>% filter(Dataset %in% 'GAO'))
# Use GGROC to combine and visualise the ROC-AUC curves
roc_AUC=ggroc(list(IEDB_Model=IEDBMODELAUC,iPred=IPREDAUC,NetTepi=NETTEPIAUC,REpitope=REPITOPE_AUC_CV,PRIME=PRIME_AUC_CV,DeepImmuno=DEEP_IMM_AUC,netMHCpan_EL=NETMHCPAN_IMM_AUC,netMHCpan_BA=NETMHCPAN_IMM_BA_AUC,GAO=GAO_AUC),legacy.axes = TRUE,size=1.25) + theme_bw() +
  annotate("size"=4,"text",x=.80,y=.19,label=paste0("IEDB_Model:     ",round(auc(IEDBMODELAUC),digits=3),"\n","iPred:     ",round(auc(IPREDAUC),digits=3),"\n","NetTepi: ",round(auc(NETTEPIAUC),digits=3),"\n","REpitope: ",round(auc(REPITOPE_AUC_CV),digits=3), "\n","PRIME: ",round(auc(PRIME_AUC_CV),digits = 3), "\n","DeepImmuno: ", round(auc(DEEP_IMM_AUC),digits = 3),  "\n","netMHCpan_EL: ", round(auc(NETMHCPAN_IMM_AUC),digits = 3),  "\n","netMHCpan_BA: ", round(auc(NETMHCPAN_IMM_BA_AUC),digits = 3),  "\n","GAO: ", round(auc(GAO_AUC),digits = 3))) + font("xy.text",size=16,color="black")+ font("xlab",size=16,color="black")+ font("ylab",size=16,color="black") + font("legend.title",color="white") + font("legend.text",size=14)+ geom_abline(size=1,intercept = 0, slope = 1,color = "darkgrey", linetype = "dashed")+theme(panel.background = element_rect(colour = "black", size=0.5))+ coord_fixed(xlim = 0:1, ylim = 0:1)#+ggtitle("ROC Curves")

4.3 Produce PR-AUC

# Set factor levels
combinedData$Immunogenicity = factor(combinedData$Immunogenicity,levels = c("Positive","Negative"))
# Create 'DATA_FOR_PR'. This is for plotting. We modify the model name labels and colour labels to ensure consistency between ROC-AUC and PR-AUC plots
DATA_FOR_PR = combinedData

#Calculate the real praucs
PR_AUC_COMBINED=combinedData %>% group_by(Dataset) %>% pr_auc(Immunogenicity,ImmunogenicityScore)
PR_AUC_COMBINED$.estimate=round(PR_AUC_COMBINED$.estimate,digits=3)

# Change model labels to ensure consistency between PR-AUC and ROC-AUC
DATA_FOR_PR[DATA_FOR_PR$Dataset == 'IEDB',]$Dataset = "IEDB_Model"
DATA_FOR_PR[DATA_FOR_PR$Dataset == 'IPRED',]$Dataset = "iPred"
DATA_FOR_PR[DATA_FOR_PR$Dataset == 'NETTEPI',]$Dataset = "NetTepi"
DATA_FOR_PR[DATA_FOR_PR$Dataset == 'REPITOPE',]$Dataset = "REpitope"

# Produce and plot PR-AUC curve
pr_AUC=DATA_FOR_PR %>% group_by(Dataset) %>%
  mutate(Dataset = factor(Dataset, levels = c("IEDB_Model","iPred","NetTepi","REpitope","PRIME","DeepImmuno","netMHCpan_EL","netMHCpan_BA","GAO"))) %>% pr_curve(Immunogenicity,ImmunogenicityScore) %>%
  autoplot() + aes(size = Dataset)+scale_size_manual(values=c(1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25)) +annotate("size"=4,"text",x=.8,y=.19,label=paste0("IEDB_Model: ",PR_AUC_COMBINED %>% filter(Dataset %in% 'IEDB')%>%pull(".estimate"),"\n","iPred: ",PR_AUC_COMBINED %>% filter(Dataset %in% 'IPRED')%>%pull(".estimate"),"\n","NetTepi: ",PR_AUC_COMBINED %>% filter(Dataset %in% 'NETTEPI')%>%pull(".estimate"),"\n","REpitope: ",PR_AUC_COMBINED %>% filter(Dataset %in% 'REPITOPE')%>%pull(".estimate"),"\n","PRIME: ",PR_AUC_COMBINED %>% filter(Dataset %in% 'PRIME')%>%pull(".estimate"),"\n","DeepImmuno: ",PR_AUC_COMBINED %>% filter(Dataset %in% 'DeepImmuno')%>%pull(".estimate"),"\n","netMHCpan_EL: ",PR_AUC_COMBINED %>% filter(Dataset %in% 'netMHCpan_EL')%>%pull(".estimate"),"\n","netMHCpan_BA: ",PR_AUC_COMBINED %>% filter(Dataset %in% 'netMHCpan_BA')%>%pull(".estimate"),"\n","GAO: ",PR_AUC_COMBINED %>% filter(Dataset %in% 'GAO')%>%pull(".estimate") )) + geom_hline(size=1,color="darkgrey",yintercept = nrow(FullDataset[FullDataset$Immunogenicity=='Positive',]) / nrow(FullDataset),linetype="dashed")+ font("xy.text",size=16,color="black")+ font("xlab",size=16,color="black")+ font("ylab",size=16,color="black") + font("legend.title",color="white") + font("legend.text",size=14)+theme(panel.background = element_rect(colour = "black", size=0.5))+ coord_fixed(xlim = 0:1, ylim = 0:1)

4.4 Figure 1A-B

#use cowplot to organise the plots
GRID1= cowplot::plot_grid(roc_AUC)
GRID2= cowplot::plot_grid(pr_AUC)

cowplot::plot_grid(GRID1,GRID2,align="hv")

5 ROC-AUC Bootstrap analysis

set.seed(41)
DATA_FOR_ROC=DATA_FOR_PR
#setup parallel backend to use multiple processors
cores=detectCores()
cl <- makeCluster(cores[1]-1) #not to overload your computer
registerDoParallel(cl)

ROC_AUC_RAND.DIST=foreach(i = 1:1000, .combine = rbind,.packages = c("dplyr","magrittr","pROC","data.table")) %dopar% {
   DATA_FOR_ROC %>%  group_by(Dataset)%>%
  mutate(Dataset = factor(Dataset, levels = c("IEDB_Model","iPred","NetTepi","REpitope","PRIME","DeepImmuno","netMHCpan_EL","netMHCpan_BA","GAO")))  %>% mutate(Shuffled_Immunogenicity=sample(size=n(),Immunogenicity))  %>% dplyr::summarise(ROC=as.numeric(roc(Shuffled_Immunogenicity ~ ImmunogenicityScore)$auc)) %>% mutate(sampleNum=i)
   }

stopCluster(cl)


ROC_AUC_COMBINED=DATA_FOR_ROC%>%
  mutate(Dataset = factor(Dataset, levels = c("IEDB_Model","iPred","NetTepi","REpitope","PRIME","DeepImmuno","netMHCpan_EL","netMHCpan_BA","GAO"))) %>% group_by(Dataset)  %>% dplyr::summarise(ROC=as.numeric(roc(Immunogenicity ~ ImmunogenicityScore)$auc))


ROC_RAND_DIST_FIG=ROC_AUC_RAND.DIST%>%
  mutate(Dataset = factor(Dataset, levels = c("IEDB_Model","iPred","NetTepi","REpitope","PRIME","DeepImmuno","netMHCpan_EL","netMHCpan_BA","GAO")))  %>% ggdensity(x="ROC",y="..density..",fill="Dataset",add="mean",color = "Dataset",alpha=0.3)  +theme_pubr(base_size = 18)+ facet_wrap(~Dataset) + geom_vline(data=ROC_AUC_COMBINED,aes(xintercept=ROC),color="black",linetype="dashed") + xlab("Area under the ROC curve") + theme(legend.position = "none")+rotate_x_text(angle=90)
ROC_RAND_DIST_FIG

5.1 Table to show z-scores wrt the bootstrap ROC-AUC analysis

ROC_AUC_RAND.DIST %>% group_by(Dataset) %>% dplyr::summarise(Random_mean=mean(ROC),sd=sd(ROC)) %>% inner_join(ROC_AUC_COMBINED) %>% dplyr::rename(Predicted=ROC) %>% mutate(zscore = round(((Predicted-Random_mean)/sd),2) ) %>% DT::datatable(caption="Mean, sd and zscore to show distance from mean of random distribution")
## Joining, by = "Dataset"

6 PR-AUC Bootstrap analysis

#setup parallel backend to use multiple processors
set.seed(41)
cores=detectCores()
cl <- makeCluster(cores[1]-1) #not to overload your computer
registerDoParallel(cl)

 PR_AUC_RAND.DIST=foreach(i = 1:1000, .combine = rbind,.packages = c("dplyr","magrittr","yardstick","data.table")) %dopar% {
    DATA_FOR_PR %>% group_by(Dataset) %>%
  mutate(Dataset = factor(Dataset, levels = c("IEDB_Model","iPred","NetTepi","REpitope","PRIME","DeepImmuno","netMHCpan_EL","netMHCpan_BA","GAO"))) %>% mutate(Shuffled_Immunogenicity=sample(size=n(),Immunogenicity)) %>% mutate(Shuffled_Immunogenicity=factor(Shuffled_Immunogenicity,levels = c("Positive","Negative"))) %>%
     pr_auc(Shuffled_Immunogenicity,ImmunogenicityScore) %>% mutate(sampleNum=i)
   }

stopCluster(cl)


PR_AUC_COMBINED=DATA_FOR_PR %>% mutate(Immunogenicity=factor(Immunogenicity,levels = c("Positive","Negative"))) %>%
  mutate(Dataset = factor(Dataset, levels = c("IEDB_Model","iPred","NetTepi","REpitope","PRIME","DeepImmuno","netMHCpan_EL","netMHCpan_BA","GAO"))) %>% group_by(Dataset) %>% pr_auc(Immunogenicity,ImmunogenicityScore)


PR_RAND_DIST_FIG=PR_AUC_RAND.DIST %>%
  mutate(Dataset = factor(Dataset, levels = c("IEDB_Model","iPred","NetTepi","REpitope","PRIME","DeepImmuno","netMHCpan_EL","netMHCpan_BA","GAO"))) %>% ggdensity(x=".estimate",y="..density..",fill="Dataset",add="mean",color = "Dataset",alpha=0.3)+theme_pubr(base_size = 18)  + facet_wrap(~Dataset) + geom_vline(data=PR_AUC_COMBINED,aes(xintercept=.estimate),color="black",linetype="dashed") + xlab("Area under the precision-recall curve") + theme(legend.position = "none")+rotate_x_text(angle=90)
PR_RAND_DIST_FIG

cowplot::plot_grid(ROC_RAND_DIST_FIG, PR_RAND_DIST_FIG, nrow=1,align="hv",axis="bt")

PR_AUC_RAND.DIST %>% group_by(Dataset) %>% dplyr::summarise(Random_mean=mean(.estimate),sd=sd(.estimate)) %>% inner_join(PR_AUC_COMBINED %>% select(!c(.metric,.estimator))) %>% dplyr::rename(Predicted=.estimate) %>% mutate(zscore = round(((Predicted-Random_mean)/sd),2) ) %>% DT::datatable(caption="Mean, sd and zscore to show distance from mean of random distribution")
## Joining, by = "Dataset"

7 Compute and visualise confusion matrices

foreach(i = 1:length(unique(combinedData$Dataset)))%do% {
    MODEL = unique(combinedData$Dataset)[i]
    ROC_MODEL=roc(Immunogenicity ~ ImmunogenicityScore,data=combinedData %>% filter(Dataset %in% MODEL))
    threshold =   coords(roc=ROC_MODEL, x="best", input="threshold", best.method="youden", transpose=F)$threshold
    threshold_data = combinedData %>% filter(Dataset %in% MODEL)%>% mutate(ImmunogenicityPrediction = ifelse(ImmunogenicityScore > threshold, "Positive","Negative"))
    CM=confusionMatrix(positive = "Positive",mode = "prec_recall",reference=factor(threshold_data  %>% select(Immunogenicity) %>% pull(),
                       levels = c("Negative","Positive")),
                       factor(threshold_data %>% select(ImmunogenicityPrediction) %>% pull(),
                       levels=c("Negative","Positive")))

  table=data.frame(CM$table)

plotTable <- table %>%
  mutate(Performance = ifelse(table$Prediction == table$Reference, "Accurate", "Inaccurate")) %>%
  group_by(Reference) %>%
  mutate(prop = Freq/sum(Freq))


CMplot=ggplot(data = plotTable, mapping = aes(x = Reference, y = Prediction, fill = Performance, alpha = prop)) +
  geom_tile() +
  geom_text(aes(label = Freq), vjust = .5, fontface  = "bold", alpha = 1,size=8) +
  scale_fill_manual(values = c(Accurate = "green", Inaccurate = "red")) +
  theme_bw() +
  xlim(rev(levels(table$Reference))) + ggtitle(MODEL)+ font("xy.text",size=18,color="black")+ font("xlab",size=18,color="black")+ font("ylab",size=18,color="black") + theme(plot.title = element_text(size=18))+ font("legend.text",size=12)

#plot(CMplot)
}
## Setting levels: control = Positive, case = Negative
## Setting direction: controls > cases
## Setting levels: control = Positive, case = Negative
## Setting direction: controls < cases
## Setting levels: control = Positive, case = Negative
## Setting direction: controls > cases
## Setting levels: control = Positive, case = Negative
## Setting direction: controls > cases
## Setting levels: control = Positive, case = Negative
## Setting direction: controls > cases
## Setting levels: control = Positive, case = Negative
## Setting direction: controls < cases
## Setting levels: control = Positive, case = Negative
## Setting direction: controls < cases
## Setting levels: control = Positive, case = Negative
## Setting direction: controls < cases
## Setting levels: control = Positive, case = Negative
## Setting direction: controls < cases
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]